#Amazon Prime Movies and TV link to kaggle: https://www.kaggle.com/datasets/dgoenrique/amazon-prime-movies-and-tv-shows

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.2
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
titles <- read.csv("/Users/leahboger/Desktop/amazon prime/titles.csv")
credits <- read.csv("/Users/leahboger/Desktop/amazon prime/credits.csv")

Q1) Which shows/movies have the largest difference in imdb score vs tmdb score, and what is that difference? A: Zone Drifter has the largest difference in scores (7.7 pt difference). After looking at the amount of votes/ popularity of the top five largest discrepancies, it seems that the larger discrepancies are very unpopular films, so there is most likely more variance in the ratings.

titles %>% mutate(score_diff = abs(imdb_score - tmdb_score)) %>% arrange(-score_diff) %>% select(title, score_diff, imdb_score, imdb_votes, tmdb_score, tmdb_popularity) %>% head(5)
##                                                           title score_diff
## 1                                                  Zone Drifter        7.7
## 2 The 1975 'At Their Very Best' Live from Madison Square Garden        7.7
## 3                                                Lockdown: 2025        7.6
## 4                                            Warriors from Hell        7.4
## 5                                   Milgram and the Fastwalkers        7.4
##   imdb_score imdb_votes tmdb_score tmdb_popularity
## 1        2.3        145         10           1.553
## 2        9.7         10          2           4.569
## 3        2.4        247         10           2.996
## 4        2.6         41         10           0.990
## 5        2.6         27         10           0.827

Q2) Create a graph to show how the average length of movies changed over time.

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(mdsr)
avg_time_per_year <- titles %>% filter(type == "MOVIE") %>% group_by(release_year) %>% summarize(avg_length = mean(runtime))

static_plot <- avg_time_per_year %>% ggplot(aes(x = release_year, y = avg_length)) + geom_line() + xlab("Release Year") + ylab("Average Movie Length")

ggplotly(static_plot)

Q3) Make a graph to explore a possible correlation between runtime and average rating(the average of the imdb and tmdb scores).

#incorrect data point removal
titles %>% filter(runtime == 940) %>% select(title)
##         title
## 1 Once Bitten
#after checking amazon.com, this movie is 93 minutes, below this data point will be fixed
titles %>% filter(runtime == 328) %>% select(title)
##                 title
## 1 Custer's Last Stand
#after checking amazon.com, this movie is 88 minutes, below this data point will be fixed

corrected_outliers <- titles %>% mutate(runtime = case_when(title == "Once Bitten" & runtime == 940 ~ 93, title == "Custer's Last Stand" & runtime == 328 ~ 88, TRUE ~ runtime))




avg_rating <- corrected_outliers %>% mutate(avg_rating = (imdb_score + tmdb_score)/2) %>% select(type, avg_rating, runtime)

static_plot2 <- avg_rating %>% ggplot(aes(x = runtime, y = avg_rating)) + geom_point() + facet_wrap( ~ type) + geom_smooth() + ylab("Average Rating") + xlab("Runtime") + xlim(0,350)

ggplotly(static_plot2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2666 rows containing non-finite values (`stat_smooth()`).

Q4) What actor has been in the most TV shows + Movies and which TV shows and movies were they in? What director? A: The actor in the most media is George ‘Gabby’ Hayes, being in 53 movies. The Director in the most media is Joesph Kane, working on 45 movies

#actor
top_actor <- credits %>% filter(role == "ACTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(person_id)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
top_actor_name <- credits %>% filter(role == "ACTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(name)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
ls_films_w_actor <- credits %>% filter(person_id == top_actor) %>% group_by(id) %>% summarize(id = id)

filmsact <- ls_films_w_actor %>% left_join(titles, by = c("id")) %>% select(title, type, seasons, imdb_score)

#top actor
top_actor_name
## [1] "George 'Gabby' Hayes"
filmsact 
## # A tibble: 53 × 4
##    title                     type  seasons imdb_score
##    <chr>                     <chr>   <dbl>      <dbl>
##  1 The Lucky Texan           MOVIE      NA        5.6
##  2 Blue Steel                MOVIE      NA        5.3
##  3 The Star Packer           MOVIE      NA        5.1
##  4 Riders of Destiny         MOVIE      NA        5.4
##  5 The Phantom Broadcast     MOVIE      NA        5.6
##  6 The Return of Casey Jones MOVIE      NA        5.1
##  7 Skyway                    MOVIE      NA        5.4
##  8 The Sphinx                MOVIE      NA        5.6
##  9 'Neath the Arizona Skies  MOVIE      NA        5.1
## 10 In Old Caliente           MOVIE      NA        5.6
## # ℹ 43 more rows
nrow(filmsact)
## [1] 53
unique(filmsact$type)
## [1] "MOVIE"
#director
top_director <- credits %>% filter(role == "DIRECTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(person_id)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
top_director_name <- credits %>% filter(role == "DIRECTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(name)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
ls_films_w_director <- credits %>% filter(person_id == top_director) %>% group_by(id) %>% summarize(id = id)

filmsdir <- ls_films_w_director %>% left_join(titles, by = c("id")) %>% select(title, type, seasons, imdb_score)

#top director

top_director_name
## [1] "Joseph Kane"
filmsdir
## # A tibble: 45 × 4
##    title                   type  seasons imdb_score
##    <chr>                   <chr>   <dbl>      <dbl>
##  1 Man from Music Mountain MOVIE      NA        6.2
##  2 Ghost-Town Gold         MOVIE      NA        6.1
##  3 Song of Texas           MOVIE      NA        5.9
##  4 The Old Corral          MOVIE      NA        5.7
##  5 Idaho                   MOVIE      NA        6  
##  6 Frontier Pony Express   MOVIE      NA        6.1
##  7 In Old Caliente         MOVIE      NA        5.6
##  8 Wall Street Cowboy      MOVIE      NA        5.8
##  9 Rough Riders' Round-up  MOVIE      NA        5.4
## 10 Southward Ho!           MOVIE      NA        6.2
## # ℹ 35 more rows
nrow(filmsdir)
## [1] 45
unique(filmsdir$type)
## [1] "MOVIE"

Q5) Graph the number of productions a person worked on versus the average imdb score those productions have. Facet by role in the media.

number_films <- credits %>% group_by(person_id) %>% summarize(num_film = length(unique(id)))


person_w_id <- credits %>% group_by(person_id, id, role) %>% summarize(id = id) %>% select(person_id, id, role)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'person_id', 'id', 'role'. You can override
## using the `.groups` argument.
imdb_w_filmnum <- person_w_id %>% left_join(titles, by = c("id")) %>% group_by(person_id, role) %>% summarize(avg_imdb = mean(imdb_score)) %>% left_join(number_films, by = c("person_id"))
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
imdb_w_filmnum %>% ggplot(aes(x= num_film, y = avg_imdb)) + geom_point() + facet_wrap( ~ role) + xlab("Number of Productions Worked On") + ylab("Average IMDB Score")
## Warning: Removed 6307 rows containing missing values (`geom_point()`).